neon.data.list <- neonUtilities::loadByProduct(
dpID = "DP1.10022.001",
site = c('ABBY','BARR'),
startdate = "2019-06",
# enddate = "2019-09",
enddate = "2021-02",
token = Sys.getenv("NEON_TOKEN"),
check.size = FALSE)
neon.data.product.id = "DP1.10022.001"
##############################################################################################
# @describeIn map_neon_data_to_ecocomDP This method will retrieve density data for BEETLE from neon.data.product.id DP1.10022.001 from the NEON data portal and map to the ecocomDP
#
##############################################################################################
map_neon.ecocomdp.10022.001.001 <- function(
neon.data.list,
neon.data.product.id = "DP1.10022.001",
...){
#NEON target taxon group is BEETLES
neon_method_id <- "neon.ecocomdp.10022.001.001"
# make sure neon.data.list matches the method
if(!any(grepl(
neon.data.product.id %>% gsub("^DP1\\.","",.) %>% gsub("\\.001$","",.),
names(neon.data.list)))) stop(
"This dataset does not appeaer to be sourced from NEON ",
neon.data.product.id,
" and cannot be mapped using method ",
neon_method_id)
# get data
beetles_raw <- neon.data.list
# helper function to calculate mode of a column/vector
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
### Clean Up Sample Data ###
# start with the fielddata table, which describes all sampling events
data_beetles <- tidyr::as_tibble(beetles_raw$bet_fielddata) %>%
dplyr::filter(sampleCollected == "Y") %>% #there's an entry for every trap, whether or not they got samples, only want ones with samples
dplyr::select(sampleID, domainID, siteID, plotID, namedLocation,
trapID, setDate, collectDate, eventID, trappingDays,
release, publicationDate,
plotType, samplingProtocolVersion, remarks) %>%
# eventID's are inconsistently separated by periods, so we remove them
dplyr::mutate(eventID = stringr::str_remove_all(eventID, "[.]")) %>%
#calculate a new trapdays column
dplyr::mutate(trappingDays = lubridate::interval(lubridate::ymd(setDate),
lubridate::ymd(collectDate)) %/%
lubridate::days(1))
#Find the traps that have multiple collectDates/bouts for the same setDate
#need to calculate their trap days from the previous collectDate, not the setDate
data_adjTrappingDays <- data.frame()
try({
data_adjTrappingDays <- data_beetles %>%
dplyr::select(namedLocation, trapID, setDate, collectDate, trappingDays, eventID,
release, publicationDate,
plotType, samplingProtocolVersion, remarks) %>%
dplyr::group_by_at(
dplyr::vars(-collectDate, -trappingDays, -eventID)) %>%
dplyr::filter(
dplyr::n_distinct(collectDate) > 1) %>%
dplyr::group_by(namedLocation, trapID, setDate) %>%
dplyr::mutate(diffTrappingDays = trappingDays - min(trappingDays)) %>%
dplyr::mutate(adjTrappingDays = dplyr::case_when(
diffTrappingDays == 0 ~ trappingDays,
TRUE ~ diffTrappingDays)) %>%
dplyr::select(-c(trappingDays, diffTrappingDays))
}, silent = TRUE)
if(nrow(data_adjTrappingDays)>0){
data_beetles <- data_beetles %>%
#update with adjusted trapping days where needed
dplyr::left_join(data_adjTrappingDays)
}else{
data_beetles$adjTrappingDays <- NA_real_
}
data_beetles <- data_beetles %>%
#update with adjusted trapping days where needed
# dplyr::left_join(adjTrappingDays) %>%
dplyr::mutate(
trappingDays = dplyr::case_when(
!is.na(adjTrappingDays) ~ adjTrappingDays,
TRUE ~ trappingDays
)) %>%
dplyr::select(-adjTrappingDays, -setDate) %>%
# for some eventID's (bouts) collection happened over two days,
# change collectDate to the date that majority of traps were collected on
dplyr::group_by(eventID) %>%
dplyr::mutate(collectDate = Mode(collectDate)) %>%
dplyr::ungroup() %>%
# there are also some sites for which all traps were set and collect on the same date, but have multiple eventID's
# we want to consider that as all one bout so we create a new ID based on the site and collectDate
tidyr::unite(boutID, siteID, collectDate, remove = FALSE) %>%
dplyr::select(-eventID) %>%
# join with bet_sorting, which describes the beetles in each sample
dplyr::left_join(beetles_raw$bet_sorting %>%
# only want carabid samples, not bycatch
dplyr::filter(sampleType %in% c("carabid", "other carabid")) %>%
dplyr::select(uid,
sampleID, subsampleID, sampleType, taxonID,
scientificName, taxonRank, identificationReferences,
nativeStatusCode,
individualCount),
by = "sampleID") %>%
dplyr::filter(!is.na(subsampleID)) #even though they were marked a sampled, some collection times don't acutally have any samples
### Clean up Taxonomy of Samples ###
#Some samples were pinned and reidentified by more expert taxonomists, replace taxonomy with their ID's (in bet_parataxonomist) where available
data_pin <- data_beetles %>%
dplyr::left_join(beetles_raw$bet_parataxonomistID %>%
dplyr::select(subsampleID, individualID, taxonID, scientificName,
taxonRank,
nativeStatusCode),
by = "subsampleID") %>%
dplyr::mutate_if(is.factor, as.character) %>%
dplyr::mutate(taxonID = ifelse(is.na(taxonID.y), taxonID.x, taxonID.y)) %>%
dplyr::mutate(taxonRank = ifelse(is.na(taxonRank.y), taxonRank.x, taxonRank.y)) %>%
dplyr::mutate(nativeStatusCode = ifelse(is.na(nativeStatusCode.y),
nativeStatusCode.x, nativeStatusCode.y)) %>%
dplyr::mutate(scientificName = ifelse(is.na(scientificName.y), scientificName.x, scientificName.y)) %>%
dplyr::mutate(identificationSource = ifelse(is.na(scientificName.y), "sort", "pin")) %>%
dplyr::select(-ends_with(".x"), -ends_with(".y"))
# some subsamples weren't fully ID'd by the pinners, so we have to recover the unpinned-individuals
lost_indv <- data_pin %>%
dplyr::filter(!is.na(individualID)) %>%
dplyr::group_by(subsampleID, individualCount) %>%
dplyr::summarise(n_ided = dplyr::n_distinct(individualID)) %>%
dplyr::filter(n_ided < individualCount) %>%
dplyr::mutate(unidentifiedCount = individualCount - n_ided) %>%
dplyr::select(subsampleID, individualCount = unidentifiedCount) %>%
dplyr::left_join(dplyr::select(data_beetles, -individualCount), by = "subsampleID") %>%
dplyr::mutate(identificationSource = "sort")
# add unpinned-individuals back to the pinned id's, adjust the individual counts so pinned individuals have a count of 1
data_pin <- data_pin %>%
dplyr::mutate(individualCount = ifelse(identificationSource == "sort", individualCount, 1)) %>%
dplyr::bind_rows(lost_indv)
#Join expert ID's to beetle dataframe
data_expert <- dplyr::left_join(data_pin,
dplyr::select(beetles_raw$bet_expertTaxonomistIDProcessed,
individualID,taxonID,scientificName,
taxonRank,
nativeStatusCode),
by = 'individualID', na_matches = "never") %>%
dplyr::distinct()
#Update with expert taxonomy where available
data_expert <- data_expert %>%
dplyr::mutate_if(is.factor, as.character) %>%
dplyr::mutate(taxonID = ifelse(is.na(taxonID.y), taxonID.x, taxonID.y)) %>%
dplyr::mutate(taxonRank = ifelse(is.na(taxonRank.y), taxonRank.x, taxonRank.y)) %>%
dplyr::mutate(nativeStatusCode = ifelse(is.na(nativeStatusCode.y),
nativeStatusCode.x, nativeStatusCode.y)) %>%
dplyr::mutate(scientificName = ifelse(is.na(scientificName.y), scientificName.x, scientificName.y)) %>%
dplyr::mutate(identificationSource = ifelse(is.na(scientificName.y), identificationSource, "expert")) %>%
dplyr::select(-ends_with(".x"), -ends_with(".y")) %>%
dplyr::group_by(individualID) %>%
dplyr::filter(dplyr::n() == 1 | is.na(individualID)) %>% #remove individuals with more than one ID, retain NA individualID's
dplyr::ungroup()
#Get raw counts table
beetles_counts <- data_expert %>%
dplyr::select(-c(subsampleID, sampleType, identificationSource, individualID)) %>%
dplyr::group_by_at(dplyr::vars(-individualCount)) %>%
dplyr::summarise(count = sum(individualCount)) %>%
dplyr::ungroup() %>%
dplyr::distinct()
# making tables ----
# Observation Tables
# All individuals of the same species collected at the same time/same location are considered the same observation, regardless of how they were ID'd
my_package_id <- paste0(neon_method_id, ".", format(Sys.time(), "%Y%m%d%H%M%S"))
table_observation_raw <- beetles_counts %>%
dplyr::distinct() %>%
dplyr::rename(
location_id = namedLocation,
abundance = count,
datetime = collectDate,
taxon_id = taxonID) %>%
dplyr::mutate(
package_id = my_package_id,
observation_id = paste0("obs_",1:nrow(.)),
# event_id = observation_id,
variable_name = "abundance",
value = abundance/trappingDays,
unit = "count per trap day") %>%
dplyr::rowwise() %>%
dplyr::mutate(
event_id = sampleID
# event_id = paste0(location_id, "_", trapID, "_", gsub("^(?i)[a-z]{4}_","",boutID)) #this is same as 'sampleID'
) %>%
dplyr::ungroup()
table_observation <- table_observation_raw %>%
dplyr::select(observation_id,
event_id,
package_id,
location_id,
datetime,
taxon_id,
variable_name,
value,
unit) %>%
dplyr::distinct()
table_observation_ancillary <- make_neon_ancillary_observation_table(
obs_wide = table_observation_raw,
ancillary_var_names = c(
"observation_id",
# "neon_event_id",
"sampleID",
"boutID",
"trapID",
"trappingDays",
"release", "publicationDate",
"samplingProtocolVersion",
"remarks",
"nativeStatusCode")) %>%
# add units where appropriate
dplyr::mutate(
unit = dplyr::case_when(
variable_name == "trappingDays" ~ "days",
TRUE ~ NA_character_
)
)
# location ----
# get relevant location info from the data, use neon helper functions
# to make location and ancillary location tables
table_location_raw <- beetles_raw$bet_fielddata %>%
dplyr::select(domainID, siteID, plotID, namedLocation,
decimalLatitude, decimalLongitude, elevation,
plotType, nlcdClass, geodeticDatum) %>%
dplyr::distinct()
table_location <- make_neon_location_table(
loc_info = table_location_raw,
loc_col_names = c("domainID", "siteID", "plotID", "namedLocation"))
table_location_ancillary <- make_neon_ancillary_location_table(
loc_info = table_location_raw,
loc_col_names = c("domainID", "siteID", "plotID", "namedLocation"),
ancillary_var_names <- c("namedLocation",
"plotType", "nlcdClass", "geodeticDatum"))
# taxonomy ----
# create a taxon table, which describes each taxonID that appears in the data set
# start with inv_taxonomyProcessed
table_taxon <- beetles_counts %>%
# keep only the coluns listed below
dplyr::select(taxonID, taxonRank, scientificName, identificationReferences) %>%
# remove rows with duplicate information
dplyr::distinct() %>%
# rename some columns
dplyr::rename(taxon_id = taxonID,
taxon_rank = taxonRank,
taxon_name = scientificName,
authority_system = identificationReferences) %>%
# concatenate different references for same taxonID
dplyr::group_by(taxon_id, taxon_rank, taxon_name) %>%
dplyr::summarise(
authority_system = paste(unique(c(authority_system)), collapse = "; "))
# data summary ----
# make dataset_summary -- required table
years_in_data <- table_observation$datetime %>% lubridate::year()
years_in_data %>% ordered()
table_dataset_summary <- data.frame(
package_id = table_observation$package_id[1],
original_package_id = neon.data.product.id,
length_of_survey_years = max(years_in_data) - min(years_in_data) + 1,
number_of_years_sampled = years_in_data %>% unique() %>% length(),
std_dev_interval_betw_years = years_in_data %>%
unique() %>% sort() %>% diff() %>% stats::sd(),
max_num_taxa = table_taxon$taxon_id %>% unique() %>% length()
)
# return tables ----
out_list <- list(
location = table_location,
location_ancillary = table_location_ancillary,
taxon = table_taxon,
observation = table_observation,
observation_ancillary = table_observation_ancillary,
dataset_summary = table_dataset_summary)
return(out_list)
} # end of function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.